# Scrape the "Movie Heaven" (dytt8) mobile listing page, follow each movie's
# detail page, and write the extracted metadata (translated title, original
# title, year, origin, genre, language, release date, director) to a CSV file.
import requests
import re
import csv

# Listing page to scrape.
url = "https://m.dytt8.net/index2.htm"
# Browser-like User-Agent so the site serves the normal page.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/100.0.4896.127 Safari/537.36 "
}

# TLS verification is deliberately disabled (verify=False below); silence the
# per-request InsecureRequestWarning that would otherwise flood the output.
requests.packages.urllib3.disable_warnings()

# timeout added so a stalled server cannot hang the script forever.
resp = requests.get(url, headers=headers, verify=False, timeout=10)
resp.encoding = 'gb2312'  # site serves GB-encoded pages
page_context = resp.text
resp.close()  # response fully consumed; release the connection now

# Pre-compiled patterns (hoisted once, reused across all pages):
# obj1: isolate the <div class="co_content2"> listing section.
obj1 = re.compile(r'<div class="co_content2">.*?<ul>(?P<uls>.*?)</div>', re.S)
# obj2: pull each entry's relative href and display name out of the list.
obj2 = re.compile(r'.*?<a href=.*?/(?P<href>.*?)\'>(?P<name>.*?)</a>', re.S)
# obj3: entries whose name starts with a digit are movie rows (vs. nav links).
obj3 = re.compile(r'^\d', re.S)
# obj4: everything up to the last '/' of the listing URL, i.e. the base URL.
obj4 = re.compile(r'.*/')
# obj5: field extractor for the detail page's metadata block.
obj5 = re.compile(r'◎译　　名　(?P<chinese_name>.*?)<br />◎片　　名　(?P<name>.*?)<br />◎年　　代　(?P<year>.*?)'
                  r'<br />◎产　　地　(?P<origin>.*?)<br />◎类　　别　(?P<type>.*?)<br />◎语　　言　(?P<langerage>.*?)'
                  r'<br />.*?◎上映日期　(?P<date>.*?)<br />.*?◎导　　演　(?P<director>.*?)<br />')

# Base URL used to resolve the relative child-page links.
domain = obj4.match(url).group()

# Collect the absolute URLs of every movie detail page on the listing.
child_url_list = []
for it in obj1.finditer(page_context):
    uls = it.group('uls')
    for i in obj2.finditer(uls):
        name = i.group('name')
        # Keep only real movie entries (names start with a ranking digit).
        if obj3.match(name) is not None:
            child_url_list.append(domain + i.group('href'))

# newline="" is required by the csv module (otherwise blank rows appear on
# Windows); utf-8 keeps the Chinese text writable on any platform. The
# context manager guarantees the file is closed even if a request fails.
with open("../files/movie_heaven.csv", mode="w", newline="", encoding="utf-8") as file:
    csvwriter = csv.writer(file)
    # Visit each detail page and extract its metadata block.
    for h in child_url_list:
        print(h)
        child_page = requests.get(h, headers=headers, verify=False, timeout=10)
        try:
            child_page.encoding = 'gb2312'
            for m in obj5.finditer(child_page.text):
                dic = m.groupdict()
                csvwriter.writerow(dic.values())
                print(dic)
        finally:
            # Close every child response so connections are not leaked
            # across the (potentially long) loop.
            child_page.close()

print("爬取完毕!")
